********************************************************************************
** Reset the session: close any log left open by an earlier run, empty memory,
** and switch off output paging so the do-file runs without pausing.
capture log close
clear all
set more off

********************************************************************************
** File paths
** All paths are relative, so run this do-file with the working directory set
** to the folder that contains it.

global input  "../input"
global output "../output"
global temp   "../temp"

********************************************************************************
** Open the log in the temp folder (overwrites the log of any previous run)
log using "$temp/get_data_quality", replace

********************************************************************************
** Get ids
********************************************************************************
** Build the page id file from the August 2014 rows of the page-month data.
** It keeps vPage, vLanguage, vTreatmentGroup, vGroup and the page age in that
** month, and is used later in this file as the lookup for treatment status.
use $temp/pagelength_users_edits, clear

sum vPageAge if vMonth == tm(2014m8)

keep if vMonth == tm(2014m8)
rename vPageAge vPageAge2014Aug

** Name the renamed variable in full. The old name vPageAge no longer exists
** at this point and would only resolve through variable-name abbreviation,
** which fails when abbreviation is switched off or becomes ambiguous.
keep vPage vLanguage vTreatmentGroup vGroup vPageAge2014Aug

** Remove rows that are identical on every remaining variable
duplicates drop 

tabulate vLanguage vTreatmentGroup

** Report (does not enforce) whether vPage-vLanguage uniquely identifies rows;
** the merges further down rely on that being the case.
duplicates report vPage vLanguage

save $temp/temp_pageids, replace

********************************************************************************
** Get watchers
********************************************************************************
** Read the wide watchers file (one watchers_<suffix> column per language
** suffix) and stack it into one row per wiki item and language suffix.
import delimited using $input/watchers.csv, varnames(1) clear
reshape long watchers_, i(wiki_item) j(vLanguage) string

** Switch to the naming used in the rest of the project
rename (watchers_ wiki_item) (watchers vPage)

tabulate vLanguage
save $temp/watchers, replace

********************************************************************************
** Get Wikipedia official quality
********************************************************************************
** Read the English-Wikipedia project ratings and build one quality and one
** importance variable per page.
import delimited using $input/quality_wikipedia_offical_english_ratings.csv, varnames(1) clear

** For each dimension: start from the wp_spain_* column, fall back to the
** wp_cities_* column where the former is empty, and code the rest as "NA".
foreach dim in Quality Importance {
	local suffix = cond("`dim'" == "Quality", "class", "importance")
	generate vWP`dim' = wp_spain_`suffix'
	replace vWP`dim' = wp_cities_`suffix' if vWP`dim' == ""
	replace vWP`dim' = "NA" if vWP`dim' == ""
	tabulate vWP`dim'
}

rename wiki_item vPage
keep vPage vWP*

save $temp/quality_wikipedia_offical, replace

********************************************************************************
** Page length in English and Spanish
********************************************************************************
** Express the Aug/Sep 2014 length of each it/de/fr page as a percentage of
** the Sep 2018 length of the same page in the "en" and "es" rows.
import delimited using $input/pagelength_alllanguages.csv, varnames(1) clear
tabulate lang
rename (lang page) (vLanguage vPage)

** Turn the year-month-day date string into a monthly date
generate vMonth = mofd(date(date, "YMD"))
format vMonth %tm
list vMonth date in 1/10
drop date

keep vLanguage vPage vMonth length
rename length vLengthNew

** Consistency check: compare these lengths with the ones in the main
** page-length file. Rows that exist only in the main file are discarded, and
** the summary of the difference is inspected in the log.
merge 1:1 vLanguage vPage vMonth using $temp/pagelength, keepusing(length)
keep if _merge != 2
drop _merge
generate tempCheck = length - vLengthNew
summarize tempCheck
drop length tempCheck
rename vLengthNew length

** Restrict the sample: no "nl" rows, nothing before August 2014
drop if vLanguage == "nl" | vMonth < tm(2014m8)
foreach ym in 2018m9 2014m9 {
	tabstat length if vMonth == tm(`ym'), by(vLanguage) stats(n)
}

** Attach the Sep 2018 "en" and "es" lengths to every row of the same page.
** The temp variable is non-missing only on the reference row, so its
** by-page mean spreads that single value across the page.
foreach ref in En Es {
	local code = lower("`ref'")
	generate temp2018Sep_`ref' = length if vMonth == tm(2018m9) & vLanguage == "`code'"
	egen vLength_2018Sep`ref' = mean(temp2018Sep_`ref'), by(vPage)
}
drop temp*

** Keep the two months around treatment for the it/de/fr pages and go wide
keep if vMonth == tm(2014m9) | vMonth == tm(2014m8)
keep if inlist(vLanguage, "it", "de", "fr")
tabulate vLanguage
generate vMonthString = cond(vMonth == tm(2014m9), "Sep", "Aug")
tabulate vMonth vMonthString
drop vMonth
reshape wide length, i(vPage vLanguage) j(vMonthString) string
summarize length*

** Length in each 2014 month as a percentage of each 2018 reference length
foreach mon in Aug Sep {
	foreach ref in En Es {
		generate vRelatLength_2014`mon'_`ref' = 100*length`mon'/vLength_2018Sep`ref'
	}
}

summarize vRelatLength*

keep vPage vLanguage vRelat*

compress

save $temp/temp_relative_length, replace

********************************************************************************
** Tversky similarity, Spanish
********************************************************************************
** Build the page-level similarity measures from tversky_es.csv: the Aug 2014
** and Sep 2014 levels, the Sep-minus-Aug 2014 change, and the 2018-minus-2014
** change between the non-August revisions.
import delimited using $input/tversky_es.csv, varnames(1) clear
rename (wiki_item lang) (vPage vLanguage)
tabulate year vLanguage
summarize similarity

** Copy the Aug 2014 similarity onto every row of the same page/language
local aug2014 month == "aug" & year == 2014
generate tempSimilarity2014Aug = similarity if `aug2014'
summarize tempSimilarity2014Aug
egen vSimilarity_2014Aug = mean(tempSimilarity2014Aug), by(vLanguage vPage)

** Check: on the Aug 2014 rows the spread value minus the raw value is summarized
generate tempCheck = vSimilarity_2014Aug - similarity if `aug2014'
summarize tempCheck
drop temp*

** From here on only the non-August revisions are kept
tabulate year month
keep if month != "aug"
tabulate year month
tabulate year vLanguage
drop month

** Bring in treatment status; only pages present in the id file are kept
merge m:1 vLanguage vPage using $temp/temp_pageids, keepusing(vTreatmentGroup)
keep if _merge == 3
drop _merge
tabulate year vLanguage

** Change in similarity between the Aug and the remaining 2014 revision,
** spread to every row of the page
generate tempTreatmentAdded = similarity - vSimilarity_2014Aug if year == 2014
egen vAdded2014Aug = mean(tempTreatmentAdded), by(vPage vLanguage)
summarize vAdded2014Aug, detail
tabstat vAdded2014Aug, by(vTreatmentGroup) stat(min p25 p50 p75 max mean n)
drop temp*

** One row per page/language with one similarity column per year
keep vLanguage vPage year similarity vSimilarity_2014Aug vTreatmentGroup vAdded2014Aug
rename similarity vSimilarity
reshape wide vSimilarity, i(vLanguage vPage) j(year)

generate vDifSimilarity_20182014 = vSimilarity2018 - vSimilarity2014

rename (vAdded2014Aug vSimilarity2014) (vDifSimilarity_2014SepAug vSimilarity_2014Sep)

keep vDifSimilarity* vSimilarity_2014Aug vSimilarity_2014Sep vLanguage vPage vTreatmentGroup

save $temp/quality_similarity_cross_section, replace

********************************************************************************
** Tversky similarity, English 2014
********************************************************************************
** Read tversky_en.csv and put the "aug" and "sept" similarity values side by
** side, one row per page and language.
import delimited using $input/tversky_en.csv, varnames(1) clear
rename (wiki_item lang) (vPage vLanguage)
tabulate year vLanguage
summarize similarity
tabulate year month

** One similarity column per month value
reshape wide similarity, i(vPage vLanguage) j(month) string
tabulate year vLanguage
drop year

rename (similarityaug similaritysept) (vSimilarityEng_2014Aug vSimilarityEng_2014Sep)
summarize vSimilarityEng_2014Aug vSimilarityEng_2014Sep
save $temp/quality_similarity_cross_section_english, replace

********************************************************************************
** Get manual ratings
********************************************************************************
** Stack the rating files (one CSV per rater id), average the ratings per page
** and language, attach treatment status, and compute rating differences.
local raters 69ee8c 903c9a ad000b 1af85e 17d3b2 400b4c
local firstRater : word 1 of `raters'

foreach i of local raters {
	import delimited using $input/quality_manualratings/save_`i'.csv, varnames(1) clear
	** Every file except the first is appended to what has been stacked so far.
	** The test uses the loop macro rather than a variable read from the CSV,
	** so it depends only on the position in the list and not on file contents.
	if "`i'" != "`firstRater'" {
		append using $temp/quality_manualratings
	}
	save $temp/quality_manualratings, replace
}

rename id vPage
rename lang vLanguage
tabulate vLanguage

** Average each rating over all rows of the same page and language
** (the original note says each page is rated by 2 raters)
collapse (mean) completeness* overall* interesting* wellwritten* illustrated*, by(vPage vLanguage)

** Merge with page ids; only pages present in the id file are kept
merge m:1 vLanguage vPage using $temp/temp_pageids , keepusing(vTreatmentGroup)
keep if _merge ==3
drop _merge

** Generate data
foreach p in completeness overall interesting wellwritten illustrated {
	** Raw differences between rating _2 and _1, and between _3 and _2
	gen vDif_`p'_21 = `p'_2 - `p'_1
	gen vDif_`p'_32 = `p'_3 - `p'_2
}

order vPage vLanguage vTreatmentGroup
tabulate  vLanguage vTreatmentGroup

** Overwrites the stacked raw file with the page-level averages
save $temp/quality_manualratings, replace

********************************************************************************
** Merge cross-section data together
********************************************************************************
** Collapse the page-month data to one row per page/language and merge in the
** similarity, manual-rating, relative-length and page-id files built above.
use $temp/pagelength_users_edits, clear
keep if inlist(vLanguage, "it", "fr", "de", "nl")

** Note: change here from vLogLength to vLogLengthMinusTreatment
** For Sep 2014 and Sep 2018, copy that month's value onto every row of the
** page (the by-page mean of a variable that is non-missing in one month only).
foreach yr in 2014 2018 {
	gen tempLogLength`yr'Sep_1 = vLogLengthMinusTreatment if vMonth == tm(`yr'm9)
	egen tempLogLength`yr'Sep_2 = mean(tempLogLength`yr'Sep_1), by(vPage vLanguage)
}
gen vDifLogLength_20182014 = tempLogLength2018Sep_2 - tempLogLength2014Sep_2

** Check: difference between the Sep 2018 value and vLogLengthMinusTreatmentYr4
gen tempCheck = tempLogLength2018Sep_2 - vLogLengthMinusTreatmentYr4
sum tempCheck, detail
tabstat tempCheck, by(vTreatmentGroup) stat(p25 p50 p75 mean n)
drop temp*
sum vDifLogLength_20182014, detail
tabstat vDifLogLength_20182014, by(vTreatmentGroup) stat(p25 p50 p75 mean n)

keep vPage vLanguage vTreatmentGroup vNumPage vNumLanguage vNumPageID vLogLength2014Aug vAverUsersPreTreatment vAverEditDaysPreTreatment vAverEditDistPreTreatment vAverEditDistDCapPreTreatment vLogLengthMinusTreatmentYr4 vDifLogLength_20182014

drop if vLogLengthMinusTreatmentYr4==.
tabulate vLanguage vTreatmentGroup

** Merge the cross-section files one after another on page and language.
** No keep() restriction is applied, so rows found only in a using file are
** retained, exactly as with separate merge / drop _merge statements.
foreach f in quality_similarity_cross_section quality_manualratings quality_similarity_cross_section_english temp_relative_length temp_pageids {
	merge 1:1 vPage vLanguage using $temp/`f'
	drop _merge
}

label variable overall_1 "Quality rating before treatment" 
label variable completeness_1 "Quality: complete before treatment"
** Label spelling corrected ("interesting")
label variable interesting_1 "Quality: interesting before treatment"
label variable wellwritten_1 "Quality: well-written before treatment"
label variable illustrated_1 "Quality: illustrated before treatment"
label variable vSimilarity_2014Aug "Similarity to Spanish before treatment"
label variable vSimilarityEng_2014Aug "Similarity to English before treatment"

save $temp/data_cross_section, replace

********************************************************************************

** Delete the intermediate files that were only needed to build
** data_cross_section
foreach f in temp_pageids quality_manualratings quality_similarity_cross_section quality_similarity_cross_section_english temp_relative_length {
	erase $temp/`f'.dta
}

********************************************************************************
** Close the log and empty memory
log close
clear
